Final_CLS_2022_Study_List_Non_Search_model_file <- read_sheet(
"https://docs.google.com/spreadsheets/d/1N48rTeq7md0v8w8pG_8XIiuapPHQAeO5WoWIB3eaceI/edit#gid=1449351377",
sheet = "FinalDataset_2022_Update"
) %>%
mutate(
Significant_Spend =
as.numeric(
case_when(
probability_of_lift >= 0.9 ~ 1,
TRUE ~ 0
)
),
country = case_when(
country == "NA" ~ "US",
TRUE ~ country
),
region_v2 = case_when(
country == "US" ~ "NA",
country == "CA" ~ "NA",
country == "US + CA" ~ "NA",
TRUE ~ region
)
) %>%
filter(channel != "Search") %>%
# filter out studies without reported lifts
filter(exposed != -1) %>%
# filter out google pay study
filter(study_id != "149142217") %>%
# filter out very negative absolute lifts
filter(absolute_lift > -1000) %>%
mutate(
pa = case_when(
pa == "Google Ads" ~ "SMB", # Step 1
pa == "YouTube" & conversion != "Type 256522942 ([MCC] YouTube TV - Web - Trial Start)" ~ "YTMP", # Step 2
pa == "YouTube Premium" ~ "YTMP", # Step 2
conversion == "Type 256522942 ([MCC] YouTube TV - Web - Trial Start)" ~ "YouTube TV", # Step 2
pa == "Cloud" & conversion != "Type 14257803 (Enterprise - Apps - Signup Confirm - Unique)" ~ "Cloud Workspace", # Step 3
pa == "Cloud" & conversion == "Type 14257803 (Enterprise - Apps - Signup Confirm - Unique)" ~ "Cloud GCP", # Step 3
pa == "Project Fi" ~ "Google Fi", # Step 4
pa == "Google Chrome" ~ "Chrome",
TRUE ~ pa
)
) %>%
mutate(
parsed_type = parse_number(conversion),
grouped_conversion = case_when(
conversion %in% c("Chromebook Microsite Referral Clicks Q4 2015", "Type 251422729 (Chromebooks Microsite Referral Clicks (Q4 2017))") ~ "Chromebook Referrals",
conversion %in% c("Desktop Downloads", "Type 11541547 (Desktop Download)") ~
"Desktop Downloads",
pa == "Pixel" ~ "Mobile Conversions",
pa == "DSM" ~ "Non-Mobile Device Conversions",
conversion == "Type 302982954 (Lena - P Lead)" ~ "Lena P Lead",
conversion == "Type 288347008 (LENA - B Lead)" ~ "Lena B Lead",
conversion == "Type 288697653 (LENA - Q Lead)" ~ "Lena Q Lead",
parsed_type %in% c(181283993, 855508686) ~ "Workspace Free Trial Start",
parsed_type == 330755641 ~ "Microsite Conversions",
parsed_type == 14257803 ~ "Enterprise Signups",
parsed_type == 289680712 ~ "Google(iOs) First Open",
parsed_type == 256522942 ~ "YouTube TV - Web - Trial Start",
parsed_type %in% c(452391534, 221497833, 277150074) ~ "Trial Signups Complete",
TRUE ~ conversion
),
pa = case_when(
conversion == "Type 288697653 (LENA - Q Lead)" ~ "SMB-QLead",
TRUE ~ pa
)
) %>%
filter(absolute_lift > 0)
# all.equal(Final_CLS_2022_Study_List_Non_Search_model_file,Final_CLS_2022_Study_List_Non_Search_v3)file.sources <- list.files(path = "RScripts/", pattern = "*.R", full.names = TRUE)
sapply(file.sources, source, .GlobalEnv) RScripts/best_ind_function.R RScripts/export_rplots_function.R RScripts/export_rplots_function2.R
value ? ? ?
visible FALSE FALSE FALSE
RScripts/graphing_function.R RScripts/graphing_function_elasticnet.R RScripts/graphing_function_rlm.R
value ? ? ?
visible FALSE FALSE FALSE
RScripts/graphing_function2.R RScripts/graphing_function3.R RScripts/graphing_function4.R
value ? ? ?
visible FALSE FALSE FALSE
RScripts/graphing_function4_w_anom.R RScripts/model_wrapper_function.R RScripts/model_wrapper_function2.R
value ? ? ?
visible FALSE FALSE FALSE
RScripts/named_group_split.R RScripts/names_function.R RScripts/ridge_lasso_function.R RScripts/ridge_lasso_function2.R
value ? ? ? ?
visible FALSE FALSE FALSE FALSE
RScripts/ridge_lasso_function4.R RScripts/rlm_function.R RScripts/rlm_function2.R
value ? ? ?
visible FALSE FALSE FALSE
### powers to try
powers <- seq(0.1, 0.9, by = 0.01)
powers2 <- 1
### Powers to Try
#powers <- seq(0.1, 0.9, by = 0.01)
#powers2 <-seq(1.5,3, by = 0.25)
### Lambda parameters
parameters <- c(
# seq(0.1, 2, by =0.1) , seq(2, 5, 0.5) ,
seq(5, 29, 1)
,seq(30, 102, 4)
,seq(110, 1000, 15)
,seq(1000, 10020, 500)
)
### elasticnet parameters
alpha_parameters <- c(seq(0, 1, 0.25))
# For Testing Purposes
#alpha_parameters <- c(seq(1, 1, 1))
start_time <- Sys.time()
Final_CLS_2022_Study_List_Non_Search_model_file_chrome_pre <-
Final_CLS_2022_Study_List_Non_Search_model_file %>%
filter(pa == "Chrome") %>%
mutate(
id2 = row_number()
)
df_test <-
Final_CLS_2022_Study_List_Non_Search_model_file_chrome_pre %>%
select(
region_v2, country, channel, tactic,
cost_spent_on_exposed_group:absolute_lift
)
iso_chrome <- isolationForest$new(sample_size = nrow(df_test), num_trees = 10000, seed = 1153)
iso_chrome$fit(df_test)
scores_train <- df_test %>%
iso_chrome$predict() %>%
arrange(desc(anomaly_score))
Final_CLS_2022_Study_List_Non_Search_model_file_chrome_pre2 <-
Final_CLS_2022_Study_List_Non_Search_model_file_chrome_pre %>%
left_join(scores_train, by = c("id2" = "id")) %>%
filter(average_depth > 3)
Final_CLS_2022_Study_List_Non_Search_model_file_chrome <-
Final_CLS_2022_Study_List_Non_Search_model_file_chrome_pre2 %>%
named_group_split(tactic)
fits_non_search_chrome <- model_wrapper_function(df = Final_CLS_2022_Study_List_Non_Search_model_file_chrome,poly_ind = 0)
best_ind_non_search_chrome <-
lapply(1:length(Final_CLS_2022_Study_List_Non_Search_model_file_chrome), best_ind_function,df = fits_non_search_chrome,
df2 = Final_CLS_2022_Study_List_Non_Search_model_file_chrome)
coef_non_search_chrome <- best_ind_non_search_chrome %>% bind_rows #make a matrix of all coefs
best_fit_non_search_chrome <- best_ind_non_search_chrome %>%
set_names(names_function(Final_CLS_2022_Study_List_Non_Search_model_file_chrome_pre, tactic)) graph_list_chrome <- lapply(1:length(best_fit_non_search_chrome), graphing_function4, df1 = best_fit_non_search_chrome, df2 = Final_CLS_2022_Study_List_Non_Search_model_file_chrome)end_time <- Sys.time()
time_chrome = end_time - start_time
time_chromestart_time <- Sys.time()
Final_CLS_2022_Study_List_Non_Search_model_file_cloud_pre <-
Final_CLS_2022_Study_List_Non_Search_model_file %>%
filter(pa %in% c("Cloud GCP", "Cloud Workspace")) %>%
mutate(
pa = "Cloud",
pa2 = "Cloud - All Channel"
) %>%
mutate(
id2 = row_number()
)
df_test <-
Final_CLS_2022_Study_List_Non_Search_model_file_cloud_pre %>%
# select(-study_id, -id2, -region, -scaling_factor, -quarter, -pa, -study_name)
select(
region_v2, country, channel, tactic,
# treatment_user_count:control,
cost_spent_on_exposed_group:absolute_lift, parsed_type
)
iso_cloud <- isolationForest$new(sample_size = nrow(df_test), num_trees = 10000, seed = 1153)
iso_cloud$fit(df_test)
scores_train <- df_test %>%
iso_cloud$predict() %>%
arrange(desc(anomaly_score))
Final_CLS_2022_Study_List_Non_Search_model_file_cloud_pre2 <-
Final_CLS_2022_Study_List_Non_Search_model_file_cloud_pre %>%
left_join(scores_train, by = c("id2" = "id"))
Final_CLS_2022_Study_List_Non_Search_model_file_cloud <-
Final_CLS_2022_Study_List_Non_Search_model_file_cloud_pre2 %>%
named_group_split(pa2)fits_non_search_cloud <- model_wrapper_function(df = Final_CLS_2022_Study_List_Non_Search_model_file_cloud,poly_ind = 0)
best_ind_non_search_cloud <-
lapply(1:length(Final_CLS_2022_Study_List_Non_Search_model_file_cloud), best_ind_function,df = fits_non_search_cloud,
df2 = Final_CLS_2022_Study_List_Non_Search_model_file_cloud)
coef_non_search_cloud <- best_ind_non_search_cloud %>% bind_rows #make a matrix of all coefs
best_fit_non_search_cloud <- best_ind_non_search_cloud %>%
set_names(names_function(Final_CLS_2022_Study_List_Non_Search_model_file_cloud_pre, pa2)) graph_list_cloud <- lapply(1:length(best_fit_non_search_cloud), graphing_function4, df1 = best_fit_non_search_cloud, df2 = Final_CLS_2022_Study_List_Non_Search_model_file_cloud)end_time <- Sys.time()
time_cloud = end_time - start_time
start_time <- Sys.time()
Final_CLS_2022_Study_List_Non_Search_model_file_youtube_pre <-
Final_CLS_2022_Study_List_Non_Search_model_file %>%
filter(pa %in% c("YouTube TV", "YTMP")) %>%
mutate(
pa = "YouTube",
pa2 = "YouTube"
) %>%
# filter(absolute_lift < 5000) %>%
mutate(
id2 = row_number()
)
df_test <-
Final_CLS_2022_Study_List_Non_Search_model_file_youtube_pre %>%
# select(-study_id, -id2, -region, -scaling_factor, -quarter, -pa, -study_name)
select(
region_v2, country, channel, tactic,
# treatment_user_count:control,
cost_spent_on_exposed_group:absolute_lift, parsed_type
)
iso_yt <- isolationForest$new(sample_size = nrow(df_test), num_trees = 10000, seed = 1153)
iso_yt$fit(df_test)
scores_train <- df_test %>%
iso_yt$predict() %>%
arrange(desc(anomaly_score))
Final_CLS_2022_Study_List_Non_Search_model_file_youtube_pre2 <-
Final_CLS_2022_Study_List_Non_Search_model_file_youtube_pre %>%
left_join(scores_train, by = c("id2" = "id")) %>%
filter(average_depth > 3.89)
Final_CLS_2022_Study_List_Non_Search_model_file_youtube <-
Final_CLS_2022_Study_List_Non_Search_model_file_youtube_pre2 %>%
named_group_split(region_v2)fits_non_search_youtube <- model_wrapper_function(df = Final_CLS_2022_Study_List_Non_Search_model_file_youtube,poly_ind = 0)
best_ind_non_search_youtube <-
lapply(1:length(Final_CLS_2022_Study_List_Non_Search_model_file_youtube), best_ind_function,df = fits_non_search_youtube,
df2 = Final_CLS_2022_Study_List_Non_Search_model_file_youtube)
coef_non_search_youtube <- best_ind_non_search_youtube %>% bind_rows #make a matrix of all coefs
best_fit_non_search_youtube <- best_ind_non_search_youtube %>%
set_names(names_function(Final_CLS_2022_Study_List_Non_Search_model_file_youtube_pre, pa2)) graph_list_youtube <- lapply(1:length(best_fit_non_search_youtube), graphing_function4, df1 = best_fit_non_search_youtube, df2 = Final_CLS_2022_Study_List_Non_Search_model_file_youtube)end_time <- Sys.time()
time_youtube = end_time - start_time
start_time <- Sys.time()
Final_CLS_2022_Study_List_Non_Search_model_file_dsm_pre <-
Final_CLS_2022_Study_List_Non_Search_model_file %>%
filter(pa == "DSM") %>%
filter(region_v2 != "APAC") %>%
# filter(absolute_lift < 1000) # %>%
# filter(study_id != '6297420') #%>%
# filter(study_id !='149161711') %>%
# filter(study_id != '148613002') %>%
# filter(study_id !='3284625') %>%
# filter(study_id !='3329131')
mutate(
id2 = row_number()
)
df_test <-
Final_CLS_2022_Study_List_Non_Search_model_file_dsm_pre %>%
# select(-study_id, -id2, -region, -scaling_factor, -quarter, -pa, -study_name)
select(
region_v2, country, channel, tactic,
# treatment_user_count:control,
cost_spent_on_exposed_group:absolute_lift
)
iso_dsm <- isolationForest$new(sample_size = nrow(df_test), num_trees = 10000, seed = 1152)
iso_dsm$fit(df_test)
scores_train <- df_test %>%
iso_dsm$predict() %>%
arrange(desc(anomaly_score))
Final_CLS_2022_Study_List_Non_Search_model_file_dsm_pre2 <-
Final_CLS_2022_Study_List_Non_Search_model_file_dsm_pre %>%
left_join(scores_train, by = c("id2" = "id")) %>%
filter(average_depth > 5)
Final_CLS_2022_Study_List_Non_Search_model_file_dsm <-
Final_CLS_2022_Study_List_Non_Search_model_file_dsm_pre2 %>%
named_group_split(region_v2, channel)fits_non_search_dsm <- model_wrapper_function(df = Final_CLS_2022_Study_List_Non_Search_model_file_dsm,poly_ind = 0)
best_ind_non_search_dsm <-
lapply(1:length(Final_CLS_2022_Study_List_Non_Search_model_file_dsm), best_ind_function,df = fits_non_search_dsm,
df2 = Final_CLS_2022_Study_List_Non_Search_model_file_dsm)
coef_non_search_dsm <- best_ind_non_search_dsm %>% bind_rows #make a matrix of all coefs
best_fit_non_search_dsm <- best_ind_non_search_dsm %>%
set_names(names_function(Final_CLS_2022_Study_List_Non_Search_model_file_dsm_pre, region_v2,channel)) graph_list_dsm <- lapply(1:length(best_fit_non_search_dsm), graphing_function4, df1 = best_fit_non_search_dsm, df2 = Final_CLS_2022_Study_List_Non_Search_model_file_dsm)end_time <- Sys.time()
time_dsm = end_time - start_time
start_time <- Sys.time()
Final_CLS_2022_Study_List_Non_Search_model_file_pixel_pre <-
Final_CLS_2022_Study_List_Non_Search_model_file %>%
filter(pa == "Pixel") %>%
mutate(
pa2 = "Pixel - All Channel"
) %>%
# filter(absolute_lift < 1000) %>%
# filter(study_id != '6297420') #%>%
# filter(study_id !='149161711') %>%
# filter(study_id != '148613002') %>%
# filter(study_id !='3284625') %>%
# filter(study_id !='3329131')
mutate(
id2 = row_number()
)
df_test <-
Final_CLS_2022_Study_List_Non_Search_model_file_pixel_pre %>%
# select(-study_id, -id2, -region, -scaling_factor, -quarter, -pa, -study_name)
select(
region_v2, country, channel, tactic,
# treatment_user_count:control,
cost_spent_on_exposed_group:absolute_lift
)
iso_pixel <- isolationForest$new(sample_size = nrow(df_test), num_trees = 10000, seed = 1152)
iso_pixel$fit(df_test)
scores_train <- df_test %>%
iso_pixel$predict() %>%
arrange(desc(anomaly_score))
Final_CLS_2022_Study_List_Non_Search_model_file_pixel_pre2 <-
Final_CLS_2022_Study_List_Non_Search_model_file_pixel_pre %>%
left_join(scores_train, by = c("id2" = "id")) %>%
filter(average_depth > 3.1)
Final_CLS_2022_Study_List_Non_Search_model_file_pixel_pre2
Final_CLS_2022_Study_List_Non_Search_model_file_pixel <-
Final_CLS_2022_Study_List_Non_Search_model_file_pixel_pre2 %>%
named_group_split(pa2)fits_non_search_pixel <- model_wrapper_function(df = Final_CLS_2022_Study_List_Non_Search_model_file_pixel,poly_ind = 0)
best_ind_non_search_pixel <-
lapply(1:length(Final_CLS_2022_Study_List_Non_Search_model_file_pixel), best_ind_function,df = fits_non_search_pixel,
df2 = Final_CLS_2022_Study_List_Non_Search_model_file_pixel)
coef_non_search_pixel <- best_ind_non_search_pixel %>% bind_rows #make a matrix of all coefs
best_fit_non_search_pixel <- best_ind_non_search_pixel %>%
set_names(names_function(Final_CLS_2022_Study_List_Non_Search_model_file_pixel_pre, pa2)) graph_list_pixel <- lapply(1:length(best_fit_non_search_pixel), graphing_function4, df1 = best_fit_non_search_pixel, df2 = Final_CLS_2022_Study_List_Non_Search_model_file_pixel)end_time <- Sys.time()
time_pixel = end_time - start_time
start_time <- Sys.time()
Final_CLS_2022_Study_List_Non_Search_model_file_fi_pre <-
Final_CLS_2022_Study_List_Non_Search_model_file %>%
filter(pa == "Google Fi") %>%
mutate(
pa2 = "Fi - All Channel"
) %>%
# filter(absolute_lift < 1000) %>%
# filter(study_id != '6297420') #%>%
# filter(study_id !='149161711') %>%
# filter(study_id != '148613002') %>%
# filter(study_id !='3284625') %>%
# filter(study_id !='3329131')
mutate(
id2 = row_number()
)
df_test <-
Final_CLS_2022_Study_List_Non_Search_model_file_fi_pre %>%
# select(-study_id, -id2, -region, -scaling_factor, -quarter, -pa, -study_name)
select(
region_v2, country, channel, tactic,
# treatment_user_count:control,
cost_spent_on_exposed_group:absolute_lift
)
iso_fi <- isolationForest$new(sample_size = nrow(df_test), num_trees = 10000, seed = 1152)
iso_fi$fit(df_test)
scores_train <- df_test %>%
iso_fi$predict() %>%
arrange(desc(anomaly_score))
Final_CLS_2022_Study_List_Non_Search_model_file_fi_pre2 <-
Final_CLS_2022_Study_List_Non_Search_model_file_fi_pre %>%
left_join(scores_train, by = c("id2" = "id")) %>%
filter(average_depth > 4.75)
Final_CLS_2022_Study_List_Non_Search_model_file_fi_pre2
Final_CLS_2022_Study_List_Non_Search_model_file_fi <-
Final_CLS_2022_Study_List_Non_Search_model_file_fi_pre2 %>%
named_group_split(channel)fits_non_search_fi <- model_wrapper_function(df = Final_CLS_2022_Study_List_Non_Search_model_file_fi,poly_ind = 0)
best_ind_non_search_fi <-
lapply(1:length(Final_CLS_2022_Study_List_Non_Search_model_file_fi), best_ind_function,df = fits_non_search_fi,
df2 = Final_CLS_2022_Study_List_Non_Search_model_file_fi)
coef_non_search_fi <- best_ind_non_search_fi %>% bind_rows #make a matrix of all coefs
best_fit_non_search_fi <- best_ind_non_search_fi %>%
set_names(names_function(Final_CLS_2022_Study_List_Non_Search_model_file_fi_pre, pa2)) graph_list_fi <- lapply(1:length(best_fit_non_search_fi), graphing_function4, df1 = best_fit_non_search_fi, df2 = Final_CLS_2022_Study_List_Non_Search_model_file_fi)end_time <- Sys.time()
time_fi = end_time - start_time
start_time <- Sys.time()
Final_CLS_2022_Study_List_Non_Search_model_file_smbq_pre <-
Final_CLS_2022_Study_List_Non_Search_model_file %>%
filter(grouped_conversion == 'Lena Q Lead') %>%
mutate(
pa2 = "SMB - Q-Lead"
) %>%
# filter(absolute_lift < 1000) %>%
# filter(study_id != '6297420') #%>%
# filter(study_id !='149161711') %>%
# filter(study_id != '148613002') %>%
# filter(study_id !='3284625') %>%
# filter(study_id !='3329131')
mutate(
id2 = row_number()
)
df_test <-
Final_CLS_2022_Study_List_Non_Search_model_file_smbq_pre %>%
# select(-study_id, -id2, -region, -scaling_factor, -quarter, -pa, -study_name)
select(
region_v2, country, channel, tactic,
# treatment_user_count:control,
cost_spent_on_exposed_group:absolute_lift
)
iso_smbq <- isolationForest$new(sample_size = nrow(df_test), num_trees = 10000, seed = 1152)
iso_smbq$fit(df_test)
scores_train <- df_test %>%
iso_smbq$predict() %>%
arrange(desc(anomaly_score))
Final_CLS_2022_Study_List_Non_Search_model_file_smbq_pre2 <-
Final_CLS_2022_Study_List_Non_Search_model_file_smbq_pre %>%
left_join(scores_train, by = c("id2" = "id")) %>%
filter(average_depth > 1)
Final_CLS_2022_Study_List_Non_Search_model_file_smbq <-
Final_CLS_2022_Study_List_Non_Search_model_file_smbq_pre2 %>%
named_group_split(pa2)fits_non_search_smbq <- model_wrapper_function(df = Final_CLS_2022_Study_List_Non_Search_model_file_smbq,poly_ind = 0)
best_ind_non_search_smbq <-
lapply(1:length(Final_CLS_2022_Study_List_Non_Search_model_file_smbq), best_ind_function,df = fits_non_search_smbq,
df2 = Final_CLS_2022_Study_List_Non_Search_model_file_smbq)
coef_non_search_smbq <- best_ind_non_search_smbq %>% bind_rows #make a matrix of all coefs
best_fit_non_search_smbq <- best_ind_non_search_smbq %>%
set_names(names_function(Final_CLS_2022_Study_List_Non_Search_model_file_smbq_pre, pa2)) graph_list_smbq <- lapply(1:length(best_fit_non_search_smbq), graphing_function4, df1 = best_fit_non_search_smbq, df2 = Final_CLS_2022_Study_List_Non_Search_model_file_smbq)end_time <- Sys.time()
time_smbq = end_time - start_time
start_time <- Sys.time()
Final_CLS_2022_Study_List_Non_Search_model_file_smbb_pre <-
Final_CLS_2022_Study_List_Non_Search_model_file %>%
filter(pa == "SMB" & grouped_conversion == 'Lena B Lead') %>%
mutate(
pa2 = "SMB - B-Lead"
) %>%
# filter(absolute_lift < 1000) %>%
# filter(study_id != '6297420') #%>%
# filter(study_id !='149161711') %>%
# filter(study_id != '148613002') %>%
# filter(study_id !='3284625') %>%
# filter(study_id !='3329131')
mutate(
id2 = row_number()
)
df_test <-
Final_CLS_2022_Study_List_Non_Search_model_file_smbb_pre %>%
# select(-study_id, -id2, -region, -scaling_factor, -quarter, -pa, -study_name)
select(
region_v2, country, channel, tactic,
# treatment_user_count:control,
cost_spent_on_exposed_group:absolute_lift
)
iso_smbb <- isolationForest$new(sample_size = nrow(df_test), num_trees = 10000, seed = 1152)
iso_smbb$fit(df_test)
scores_train <- df_test %>%
iso_smbb$predict() %>%
arrange(desc(anomaly_score))
Final_CLS_2022_Study_List_Non_Search_model_file_smbb_pre2 <-
Final_CLS_2022_Study_List_Non_Search_model_file_smbb_pre %>%
left_join(scores_train, by = c("id2" = "id")) %>%
filter(average_depth > 4)
Final_CLS_2022_Study_List_Non_Search_model_file_smbb <-
Final_CLS_2022_Study_List_Non_Search_model_file_smbb_pre2 %>%
named_group_split(channel)fits_non_search_smbb <- model_wrapper_function(df = Final_CLS_2022_Study_List_Non_Search_model_file_smbb,poly_ind = 0)
best_ind_non_search_smbb <-
lapply(1:length(Final_CLS_2022_Study_List_Non_Search_model_file_smbb), best_ind_function,df = fits_non_search_smbb,
df2 = Final_CLS_2022_Study_List_Non_Search_model_file_smbb)
coef_non_search_smbb <- best_ind_non_search_smbb %>% bind_rows #make a matrix of all coefs
best_fit_non_search_smbb <- best_ind_non_search_smbb %>%
set_names(names_function(Final_CLS_2022_Study_List_Non_Search_model_file_smbb_pre, channel)) graph_list_smbb <- lapply(1:length(best_fit_non_search_smbb), graphing_function4, df1 = best_fit_non_search_smbb, df2 = Final_CLS_2022_Study_List_Non_Search_model_file_smbb)end_time <- Sys.time()
time_smbb = end_time - start_timegraph_names <- mget(ls(pat = 'graph_list_'))
df_names <- mget(setdiff(ls(pattern = 'Final_CLS_2022_Study_List_Non_Search_model_file_'), ls(pattern = "pre")))
#rm(Final_CLS_2022_Study_List_Non_Search_model_file_Chrome,Final_CLS_2022_Study_List_Non_Search_model_file_Cloud,Final_CLS_2022_Study_List_Non_Search_model_file_YouTube)
#lapply(1:length(graph_names),
# function(j) {
#lapply(1:length(df_names[[j]]),export_rplots_function2,starting_name = "Non_Search_",folder_name = folder_name,df_list = #df_names[[j]],graphing_list = graph_names[j][[1]])
# }
# )graph_list.fi <- lapply(1:length(best_fit_non_search_fi), graphing_function4_w_anom, df1 = best_fit_non_search_fi, df2 = Final_CLS_2022_Study_List_Non_Search_model_file_fi)
### Add GG Text Repel
ggplotly(graph_list.fi[[3]])
start_time <- Sys.time()
fits.non.search.RIDGE_LASSO <- lapply(
1:length(df_names),
function(i) {
model_wrapper_function(df = df_names[i][[1]],poly_ind = 0)
}
)
start_time <- Sys.time()
fits.non.search.RLM <- lapply(
1:length(df_names),
function(i) {
model_wrapper_function2(df = df_names[i][[1]])
}
)
end_time <- Sys.time()
combined_rlm_time <- start_time - end_time
best.ind.non.search.RLM <- lapply(
1:length(df_names),
function(i) {
lapply(1:length(df_names[i][[1]]), best_ind_function,df = fits.non.search.RLM[i][[1]],
df2 = df_names[i][[1]])
}
)
coef.non.search.RLM <- lapply(
1:length(df_names),
function (i){
best.ind.non.search.RLM[i][[1]] %>% bind_rows
}
) %>%
bind_rows() %>%
as.data.frame() %>%
mutate(
cost_p2 = 0,
lambda = 0,
alpha = 0,
powers2 = 0
) %>%
select(one_of(colnames(coef.2_matrix)))
best.fit.non.search.RLM <- lapply(1:length(df_names),
function(j) {
lapply(1:length(best.ind.non.search.RLM[[j]]),
function(i){
best.ind.non.search.RLM[j][[1]][i] %>%
set_names(nm = best.ind.non.search.RLM[j][[1]][[i]]["model"])
}
)
}
)
-combined_ridge_time+combined_rlm_timeTime difference of 32.768 mins
graph.list.rlm <- lapply(1:length(df_names),
function(i){
lapply(1:length(best.fit.non.search.RLM[i][[1]]), graphing_function_rlm, df1= best.fit.non.search.RLM[i],df2 = df_names[i])
}
)
graph.list.RIDGE_LASSO <- lapply(1:length(df_names),
function(i){
lapply(1:length(best.fit.non.search.RIDGE_LASSO[i][[1]]), graphing_function_elasticnet, df1= best.fit.non.search.RIDGE_LASSO[i],df2 = df_names[i])
}
)
folder_name1 <- paste0("Output/", "outputfiles_", Sys.Date(), "_", "RLM", "/")
dir.create(folder_name1) # it will throw a warning if folder existsWarning in dir.create(folder_name1) :
'Output\outputfiles_2022-11-08_RLM' already exists
lapply(1:length(df_names),
function(j) {
lapply(1:length(df_names[[j]]),export_rplots_function2,starting_name = "Non_Search_",folder_name = folder_name1,df_list = df_names[[j]],graphing_list = graph.list.rlm[j][[1]])
}
)Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
[[1]]
[[1]][[1]]
[1] "Output/outputfiles_2022-11-08_RLM/Non_Search_Chrome_All_Channel.png"
[[1]][[2]]
[1] "Output/outputfiles_2022-11-08_RLM/Non_Search_Chrome_non-REMK.png"
[[1]][[3]]
[1] "Output/outputfiles_2022-11-08_RLM/Non_Search_Chrome_REMK.png"
[[2]]
[[2]][[1]]
[1] "Output/outputfiles_2022-11-08_RLM/Non_Search_Cloud_Cloud_-_All_Channel.png"
[[3]]
[[3]][[1]]
[1] "Output/outputfiles_2022-11-08_RLM/Non_Search_DSM_EMEA__DISCOVERY.png"
[[3]][[2]]
[1] "Output/outputfiles_2022-11-08_RLM/Non_Search_DSM_EMEA__DISPLAY.png"
[[3]][[3]]
[1] "Output/outputfiles_2022-11-08_RLM/Non_Search_DSM_EMEA__YOUTUBE.png"
[[3]][[4]]
[1] "Output/outputfiles_2022-11-08_RLM/Non_Search_DSM_NA__DISCOVERY.png"
[[3]][[5]]
[1] "Output/outputfiles_2022-11-08_RLM/Non_Search_DSM_NA__DISPLAY.png"
[[3]][[6]]
[1] "Output/outputfiles_2022-11-08_RLM/Non_Search_DSM_NA__YOUTUBE.png"
[[4]]
[[4]][[1]]
[1] "Output/outputfiles_2022-11-08_RLM/Non_Search_Google_Fi_DISCOVERY.png"
[[4]][[2]]
[1] "Output/outputfiles_2022-11-08_RLM/Non_Search_Google_Fi_DISPLAY.png"
[[4]][[3]]
[1] "Output/outputfiles_2022-11-08_RLM/Non_Search_Google_Fi_YOUTUBE.png"
[[5]]
[[5]][[1]]
[1] "Output/outputfiles_2022-11-08_RLM/Non_Search_Pixel_Pixel_-_All_Channel.png"
[[6]]
[[6]][[1]]
[1] "Output/outputfiles_2022-11-08_RLM/Non_Search_SMB_DISCOVERY.png"
[[6]][[2]]
[1] "Output/outputfiles_2022-11-08_RLM/Non_Search_SMB_DISPLAY.png"
[[6]][[3]]
[1] "Output/outputfiles_2022-11-08_RLM/Non_Search_SMB_YOUTUBE.png"
[[7]]
[[7]][[1]]
[1] "Output/outputfiles_2022-11-08_RLM/Non_Search_SMB-QLead_SMB_-_Q-Lead.png"
[[8]]
[[8]][[1]]
[1] "Output/outputfiles_2022-11-08_RLM/Non_Search_YouTube_APAC.png"
[[8]][[2]]
[1] "Output/outputfiles_2022-11-08_RLM/Non_Search_YouTube_EMEA.png"
[[8]][[3]]
[1] "Output/outputfiles_2022-11-08_RLM/Non_Search_YouTube_NA.png"
folder_name2 <- paste0("Output/", "outputfiles_", Sys.Date(), "_", "ElasticNet", "/")
dir.create(folder_name2) # it will throw a warning if folder existsWarning in dir.create(folder_name2) :
'Output\outputfiles_2022-11-08_ElasticNet' already exists
lapply(1:length(df_names),
function(j) {
lapply(1:length(df_names[[j]]),export_rplots_function2,starting_name = "Non_Search_",folder_name = folder_name2,df_list = df_names[[j]],graphing_list = graph.list.RIDGE_LASSO[j][[1]])
}
)Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
Saving 18.8 x 12.5 in image
[[1]]
[[1]][[1]]
[1] "Output/outputfiles_2022-11-08_ElasticNet/Non_Search_Chrome_All_Channel.png"
[[1]][[2]]
[1] "Output/outputfiles_2022-11-08_ElasticNet/Non_Search_Chrome_non-REMK.png"
[[1]][[3]]
[1] "Output/outputfiles_2022-11-08_ElasticNet/Non_Search_Chrome_REMK.png"
[[2]]
[[2]][[1]]
[1] "Output/outputfiles_2022-11-08_ElasticNet/Non_Search_Cloud_Cloud_-_All_Channel.png"
[[3]]
[[3]][[1]]
[1] "Output/outputfiles_2022-11-08_ElasticNet/Non_Search_DSM_EMEA__DISCOVERY.png"
[[3]][[2]]
[1] "Output/outputfiles_2022-11-08_ElasticNet/Non_Search_DSM_EMEA__DISPLAY.png"
[[3]][[3]]
[1] "Output/outputfiles_2022-11-08_ElasticNet/Non_Search_DSM_EMEA__YOUTUBE.png"
[[3]][[4]]
[1] "Output/outputfiles_2022-11-08_ElasticNet/Non_Search_DSM_NA__DISCOVERY.png"
[[3]][[5]]
[1] "Output/outputfiles_2022-11-08_ElasticNet/Non_Search_DSM_NA__DISPLAY.png"
[[3]][[6]]
[1] "Output/outputfiles_2022-11-08_ElasticNet/Non_Search_DSM_NA__YOUTUBE.png"
[[4]]
[[4]][[1]]
[1] "Output/outputfiles_2022-11-08_ElasticNet/Non_Search_Google_Fi_DISCOVERY.png"
[[4]][[2]]
[1] "Output/outputfiles_2022-11-08_ElasticNet/Non_Search_Google_Fi_DISPLAY.png"
[[4]][[3]]
[1] "Output/outputfiles_2022-11-08_ElasticNet/Non_Search_Google_Fi_YOUTUBE.png"
[[5]]
[[5]][[1]]
[1] "Output/outputfiles_2022-11-08_ElasticNet/Non_Search_Pixel_Pixel_-_All_Channel.png"
[[6]]
[[6]][[1]]
[1] "Output/outputfiles_2022-11-08_ElasticNet/Non_Search_SMB_DISCOVERY.png"
[[6]][[2]]
[1] "Output/outputfiles_2022-11-08_ElasticNet/Non_Search_SMB_DISPLAY.png"
[[6]][[3]]
[1] "Output/outputfiles_2022-11-08_ElasticNet/Non_Search_SMB_YOUTUBE.png"
[[7]]
[[7]][[1]]
[1] "Output/outputfiles_2022-11-08_ElasticNet/Non_Search_SMB-QLead_SMB_-_Q-Lead.png"
[[8]]
[[8]][[1]]
[1] "Output/outputfiles_2022-11-08_ElasticNet/Non_Search_YouTube_APAC.png"
[[8]][[2]]
[1] "Output/outputfiles_2022-11-08_ElasticNet/Non_Search_YouTube_EMEA.png"
[[8]][[3]]
[1] "Output/outputfiles_2022-11-08_ElasticNet/Non_Search_YouTube_NA.png"
lapply(1:length(df_names),
function(j){
subplot(graph.list.rlm[j][[1]], nrows = length(graph.list.rlm[j][[1]]))
}
)[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
[[6]]
[[7]]
[[8]]
lapply(1:length(df_names),
function(i){
#p1 = graph_names[i][[1]]
do.call(grid.arrange,graph.list.rlm[i][[1]])
#return(grid.arrange(grobs = p1))
}
)[[1]]
TableGrob (3 x 1) "arrange": 3 grobs
[[2]]
TableGrob (1 x 1) "arrange": 1 grobs
[[3]]
TableGrob (3 x 2) "arrange": 6 grobs
[[4]]
TableGrob (3 x 1) "arrange": 3 grobs
[[5]]
TableGrob (1 x 1) "arrange": 1 grobs
[[6]]
TableGrob (3 x 1) "arrange": 3 grobs
[[7]]
TableGrob (1 x 1) "arrange": 1 grobs
[[8]]
TableGrob (3 x 1) "arrange": 3 grobs
NA
lapply(1:length(df_names),
function(j){
subplot(graph.list.RIDGE_LASSO[j][[1]], nrows = length(graph.list.RIDGE_LASSO[j][[1]]))
}
)[[1]]
[[2]]
[[3]]
[[4]]
[[5]]
[[6]]
[[7]]
[[8]]
lapply(1:length(df_names),
function(i){
#p1 = graph_names[i][[1]]
do.call(grid.arrange,graph.list.RIDGE_LASSO[i][[1]])
#return(grid.arrange(grobs = p1))
}
)[[1]]
TableGrob (3 x 1) "arrange": 3 grobs
[[2]]
TableGrob (1 x 1) "arrange": 1 grobs
[[3]]
TableGrob (3 x 2) "arrange": 6 grobs
[[4]]
TableGrob (3 x 1) "arrange": 3 grobs
[[5]]
TableGrob (1 x 1) "arrange": 1 grobs
[[6]]
TableGrob (3 x 1) "arrange": 3 grobs
[[7]]
TableGrob (1 x 1) "arrange": 1 grobs
[[8]]
TableGrob (3 x 1) "arrange": 3 grobs
NA
p_load(lme4)
p_load(metaforest)
Final_CLS_2022_Study_List_Non_Search_model_file_dsm_pre <-
Final_CLS_2022_Study_List_Non_Search_model_file %>%
filter(pa == "DSM") %>%
filter(region_v2 != "APAC") %>%
# filter(absolute_lift < 1000) # %>%
# filter(study_id != '6297420') #%>%
# filter(study_id !='149161711') %>%
# filter(study_id != '148613002') %>%
# filter(study_id !='3284625') %>%
# filter(study_id !='3329131')
mutate(
id2 = row_number()
)
df_test <-
Final_CLS_2022_Study_List_Non_Search_model_file_dsm_pre %>%
# select(-study_id, -id2, -region, -scaling_factor, -quarter, -pa, -study_name)
select(
region_v2, country, channel, tactic,
# treatment_user_count:control,
cost_spent_on_exposed_group:absolute_lift
)
iso_dsm <- isolationForest$new(sample_size = nrow(df_test), num_trees = 10000, seed = 1152)
iso_dsm$fit(df_test)INFO [19:29:47.734] Building Isolation Forest ...
INFO [19:29:47.957] done
INFO [19:29:47.967] Computing depth of terminal nodes ...
INFO [19:30:04.726] done
INFO [19:30:04.906] Completed growing isolation forest
scores_train <- df_test %>%
iso_dsm$predict() %>%
arrange(desc(anomaly_score))
Final_CLS_2022_Study_List_Non_Search_model_file_dsm_Meta4_V1 <-
Final_CLS_2022_Study_List_Non_Search_model_file_dsm_pre %>%
left_join(scores_train, by = c("id2" = "id")) %>%
filter(average_depth > 5.209)
Final_CLS_2022_Study_List_Non_Search_model_file_dsm_Meta4 <-
Final_CLS_2022_Study_List_Non_Search_model_file_dsm_Meta4_V1 %>%
mutate(
p1 = exposed/treatment_user_count,
q1 = 1 - p1,
n1 = treatment_user_count,
sd1 = sqrt(p1*q1*n1),
p2 = scaled_control/treatment_user_count,
q2 = 1-p2,
n2 = treatment_user_count,
sd2 = sqrt(p2*q2*n2),
cost_p = cost_spent_on_exposed_group ^ 0.4
) %>%
select(-p1,-q1,-n1,-p2,-q2,-n2) %>%
named_group_split(pa)
df_SMD <- list()
for (i in 1:length(Final_CLS_2022_Study_List_Non_Search_model_file_dsm_Meta4)){
df_SMD[[i]] <-
escalc(
measure = "SMD",
m1i = exposed,
m2i = scaled_control,
sd1i = sd1,
sd2i = sd2,
n1i = treatment_user_count,
n2i = treatment_user_count,
data = Final_CLS_2022_Study_List_Non_Search_model_file_dsm_Meta4[[i]]
)
names(df_SMD)[i] <- names(Final_CLS_2022_Study_List_Non_Search_model_file_dsm_Meta4[i])
}
i = 1
yi_DSM = df_SMD[[i]]['absolute_lift'] %>% unlist()
vi_DSM = df_SMD[[i]]['sd1'] %>% unlist()
split2 = factor(df_SMD[[i]]['channel'] %>% unlist(),labels = unique(df_SMD[[i]]['channel']) %>% unlist())
m_reg <- rma(yi = yi, # The d-column of the df, which contains Cohen's d
vi = vi # The vi-column of the df, which contains the variances
,mods = ~channel:cost_p-1 #to remove intercept between slopes
,data = df_SMD[[i]]
)
m_reg
Mixed-Effects Model (k = 23; tau^2 estimator: REML)
tau^2 (estimated amount of residual heterogeneity): 21.6012 (SE = 6.8309)
tau (square root of estimated tau^2 value): 4.6477
I^2 (residual heterogeneity / unaccounted variability): 100.00%
H^2 (unaccounted variability / sampling variability): 49986503.17
Test for Residual Heterogeneity:
QE(df = 20) = 136662595.7290, p-val < .0001
Test of Moderators (coefficients 1:3):
QM(df = 3) = 25.5645, p-val < .0001
Model Results:
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#rm(Final_CLS_2022_Study_List_Non_Search_model_file_dsm_Meta4_V1)#,mod1_test, i)
predict(m_reg)forest(m_reg, slab = df_SMD[[i]]['study_name'] %>% unlist(), addcred = TRUE)
# Specify basic plot, mapping sex to the x-axis, effect size 'd' to the y-axis,
# and 'weights' to the weight parameter.
df_SMD[[i]] %>%
ggplot()+
aes(
x = cost_spent_on_exposed_group,
y = yi,
size = 1/sqrt(vi)
) +
geom_point(shape = 1) + # Add scatter
geom_abline(intercept = 0, slope = m_reg$b[2]) + # Add regression line
# theme_bw() + # Apply black and white theme
theme(legend.position = "none") # Remove legendDocumentation: * https://pages.stat.wisc.edu/~bates/UseR2008/WorkshopD.pdf
i = 1
mod_lme4 <- lmer(formula = log(absolute_lift) ~ 0 + cost_p
# + region_v2
# + channel
# + (0+ region_v2|channel)
+ (1 + cost_p|channel)
# + (0+ 1|channel)
# + (1+ 1|channel:tactic)
# + (channel|tactic)
# +(cost_p:channel)
,data = df_SMD[[i]], REML = TRUE) #False calls on MLE which are known to be biasedboundary (singular) fit: see ?isSingular
summary(mod_lme4)Linear mixed model fit by REML. t-tests use Satterthwaite's method ['lmerModLmerTest']
Formula: log(absolute_lift) ~ 0 + cost_p + (1 + cost_p | channel)
Data: df_SMD[[i]]
REML criterion at convergence: 88.6
Scaled residuals:
Min 1Q Median 3Q Max
-2.198 -0.150 0.142 0.550 2.074
Random effects:
Groups Name Variance Std.Dev. Corr
channel (Intercept) 15.25205 3.9054
cost_p 0.00111 0.0333 -1.00
Residual 1.48538 1.2188
Number of obs: 23, groups: channel, 3
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
cost_p 0.04817 0.00316 5.79695 15.3 0.0000067 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
optimizer (nloptwrap) convergence code: 0 (OK)
boundary (singular) fit: see ?isSingular
predict(mod_lme4) 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
5.5910 5.4268 5.3498 4.8830 5.6498 5.8099 4.3085 4.2552 3.8610 4.3744 4.9673 5.9422 6.1465 3.9900 5.1042
16 17 18 19 20 21 22 23
3.6101 5.1936 5.1127 5.8055 4.6467 4.6814 5.6567 4.9926
df_SMD[[i]]$preds <- predict(mod_lme4)
fixef(mod_lme4) cost_p
0.048166
ranef(mod_lme4, drop = FALSE)$channel
(Intercept) cost_p
DISCOVERY 5.90082 -0.0502700
DISPLAY 0.98253 -0.0083704
YOUTUBE 3.49158 -0.0297453
with conditional variances for “channel”
p<-
df_SMD[[i]] %>%
ggplot(aes(x=cost_spent_on_exposed_group, y=preds, group = channel, colour = channel)) +
geom_line() +
labs(x="Spend", y="Absolute Lift") +
ggtitle("Mixed Effects Model") +
# scale_colour_discrete('pa')+
geom_jitter(aes(x=cost_spent_on_exposed_group, y = log(absolute_lift), size = vi ))
p
ggplotly(p)NAhttps://lmudge13.github.io/sample_code/mixed_effects.html
p_load(sjPlot) #for plotting lmer and glmer mods
p_load(sjmisc)
p_load(effects)
p_load(sjstats) #use for r2 functions
sjPlot::plot_model(mod_lme4)`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
sjPlot:: tab_model(mod_lme4)effects_costp <- effects::effect(term= "cost_p", mod= mod_lme4) %>% as.data.frame()
summary(effects_costp) #output of what the values are cost_p fit se lower upper
Min. : 45.0 Min. :2.17 Min. :0.142 Min. :1.87 Min. :2.46
1st Qu.: 66.0 1st Qu.:3.18 1st Qu.:0.208 1st Qu.:2.75 1st Qu.:3.61
Median : 88.0 Median :4.24 Median :0.278 Median :3.66 Median :4.81
Mean : 87.8 Mean :4.23 Mean :0.277 Mean :3.65 Mean :4.80
3rd Qu.:110.0 3rd Qu.:5.30 3rd Qu.:0.347 3rd Qu.:4.58 3rd Qu.:6.02
Max. :130.0 Max. :6.26 Max. :0.410 Max. :5.41 Max. :7.11
ggplot() +
#2
geom_point(data=df_SMD[[i]], aes(cost_p, log(absolute_lift))) +
#3
geom_point(data=effects_costp, aes(x=cost_p, y=fit), color="blue") +
#4
geom_line(data=effects_costp, aes(x=cost_p, y=fit), color="blue") +
#5
geom_ribbon(data= effects_costp, aes(x=cost_p, ymin=lower, ymax=upper), alpha= 0.3, fill="blue") +
#6
labs(x="cost_p", y="Log(Absolute Lift)")